{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据形状: (614872, 10)\n",
      "列名: ['concept_id', 'concept_name', 'domain_id', 'vocabulary_id', 'concept_class_id', 'standard_concept', 'concept_code', 'valid_start_date', 'valid_end_date', 'invalid_reason']\n",
      "\n",
      "随机5行数据:\n",
      "        concept_id  \\\n",
      "586896    45767111   \n",
      "68710     41950966   \n",
      "33025     37173249   \n",
      "11232     46286891   \n",
      "552514     4122186   \n",
      "\n",
      "                                                                                                                                       concept_name  \\\n",
      "586896                                                                                                                 At risk of coping impairment   \n",
      "68710                                                                                                                                    Saint-Piat   \n",
      "33025                                                                                          Protein/creatinine mass concentration ratio in urine   \n",
      "11232   HoNOSCA-CR (Health of the Nation Outcome Scales for Children and Adolescents - clinician-rated) scale 11 score - self care and independence   \n",
      "552514                                                                                                                 Chronic osteomyelitis of hip   \n",
      "\n",
      "          domain_id vocabulary_id   concept_class_id standard_concept  \\\n",
      "586896  Observation        SNOMED   Clinical Finding                S   \n",
      "68710     Geography           OSM          8th level                S   \n",
      "33025   Measurement        SNOMED  Observable Entity                S   \n",
      "11232   Measurement        SNOMED  Observable Entity              NaN   \n",
      "552514    Condition        SNOMED           Disorder                S   \n",
      "\n",
      "           concept_code  valid_start_date  valid_end_date  invalid_reason  \n",
      "586896        704421005          20150131        20991231             NaN  \n",
      "68710           1583966          19700101        20991231             NaN  \n",
      "33025    58201000237103          20230607        20991231             NaN  \n",
      "11232   989771000000106          20151001        20991231             NaN  \n",
      "552514        287055001          20020131        20991231             NaN  \n"
     ]
    }
   ],
   "source": [
    "# display the snomed file\n",
    "import pandas as pd\n",
    "\n",
    "df = pd.read_csv('/home/huangj2/Documents/RAG_P2_医疗名词/01.standardization/data/SNOMED-CT/SNOMED_valid_comma.csv',\n",
    "                low_memory=False\n",
    "                )\n",
    "\n",
    "# 显示数据的基本信息\n",
    "print(\"数据形状:\", df.shape)\n",
    "print(\"列名:\", df.columns.tolist())\n",
    "\n",
    "# 随机展示5行数据的完整内容\n",
    "print(\"\\n随机5行数据:\")\n",
    "pd.set_option('display.max_columns', None)  # 显示所有列\n",
    "pd.set_option('display.width', None)  # 显示所有内容不截断\n",
    "pd.set_option('display.max_colwidth', None)  # 显示每列的完整内容\n",
    "print(df.sample(5))  # 使用sample方法随机抽取5行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "数据类型和非空值统计:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 614872 entries, 0 to 614871\n",
      "Data columns (total 10 columns):\n",
      " #   Column            Non-Null Count   Dtype  \n",
      "---  ------            --------------   -----  \n",
      " 0   concept_id        614872 non-null  int64  \n",
      " 1   concept_name      614870 non-null  object \n",
      " 2   domain_id         614872 non-null  object \n",
      " 3   vocabulary_id     614871 non-null  object \n",
      " 4   concept_class_id  614872 non-null  object \n",
      " 5   standard_concept  547516 non-null  object \n",
      " 6   concept_code      614872 non-null  object \n",
      " 7   valid_start_date  614872 non-null  int64  \n",
      " 8   valid_end_date    614872 non-null  int64  \n",
      " 9   invalid_reason    0 non-null       float64\n",
      "dtypes: float64(1), int64(3), object(6)\n",
      "memory usage: 46.9+ MB\n",
      "None\n"
     ]
    }
   ],
   "source": [
    "# 显示每列的数据类型和非空值数量\n",
    "print(\"\\n数据类型和非空值统计:\")\n",
    "print(df.info())\n",
    "\n",
    "# 显示数值列的统计摘要\n",
    "# print(\"\\n数值列统计摘要:\")\n",
    "# print(df.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "domain_id 和 concept_class_id 的列表:\n",
      "['Condition' 'Metadata' 'Condition Status' 'Type Concept' 'Cost' 'Visit'\n",
      " 'Unit' 'Episode' 'Revenue Code' 'Language' 'Relationship' 'Observation'\n",
      " 'Procedure' 'Measurement' 'Drug' 'Spec Anatomic Site' 'Specimen'\n",
      " 'Meas Value' 'Route' 'Device' 'Geography' 'Race' 'Provider' 'Plan'\n",
      " 'Plan Stop Reason' 'Meas Value Operator' 'Gender' 'Payer' 'Sponsor']\n",
      "['Clinical Finding' 'Disorder' 'Field' 'Table' 'CDM' 'Concept Class'\n",
      " 'Condition Status' 'Death Type' 'Condition Type' 'Detail' 'Summary'\n",
      " 'Cost Type' 'Device Type' 'Domain' 'Drug Type' 'Vocabulary' 'Visit'\n",
      " 'Unit' 'Episode of Care' 'Treatment' 'Disease Extent' 'Disease Dynamic'\n",
      " 'Revenue Code' 'Language' 'Relationship' 'Procedure' 'Observable Entity'\n",
      " 'Substance' 'Location' 'Body Structure' 'Specimen' 'Qualifier Value'\n",
      " 'Physical Object' 'Organism' 'Pharma/Biol Product' 'Model Comp'\n",
      " 'Social Context' 'Morph Abnormality' 'Dose Form' 'Context-dependent'\n",
      " 'Clinical Drug' 'Event' 'Clinical Drug Form' 'Disposition'\n",
      " 'Staging / Scales' 'Record Artifact' 'Namespace Concept' 'Physical Force'\n",
      " 'Attribute' 'Meas Type' 'Metadata' 'Undefined' 'Note Type' '9th level'\n",
      " '8th level' '10th level' '6th level' '7th level' '4th level' '11th level'\n",
      " '5th level' '3rd level' '2nd level' '12th level' 'Obs Period Type'\n",
      " 'Observation Type' 'Benefit' 'Metal level' 'Plan Stop Reason'\n",
      " 'Procedure Type' 'Navi Concept' 'Special Concept' 'Payer' 'Sponsor'\n",
      " 'Type Concept' 'UB04 Point of Origin' 'UB04 Pri Typ of Adm'\n",
      " 'UB04 Pt dis status' 'Observation' 'Canonical Unit' 'US Census Region'\n",
      " 'US Census Division' 'Visit Type' 'Linkage Concept' 'Linkage Assertion']\n"
     ]
    }
   ],
   "source": [
    "# domain_id 和 concept_class_id - 展示一下列表\n",
    "print(\"\\ndomain_id 和 concept_class_id 的列表:\")\n",
    "print(df['domain_id'].unique())\n",
    "print(df['concept_class_id'].unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "包含'Dyspnea'的概念:\n",
      "找到 24 条包含'Dyspnea'的记录\n"
     ]
    },
    {
     "ename": "KeyError",
     "evalue": "\"['Synonyms'] not in index\"",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[3], line 5\u001b[0m\n\u001b[1;32m      3\u001b[0m dyspnea_concepts \u001b[38;5;241m=\u001b[39m df[df[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mconcept_name\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mcontains(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mDyspnea\u001b[39m\u001b[38;5;124m'\u001b[39m, case\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, na\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)]\n\u001b[1;32m      4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m找到 \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(dyspnea_concepts)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m 条包含\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mDyspnea\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m的记录\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mdyspnea_concepts\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mconcept_code\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mconcept_name\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mSynonyms\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m)\n",
      "File \u001b[0;32m/mnt/external_disk/venv/20250203_SimplePython/lib/python3.10/site-packages/pandas/core/frame.py:4108\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m   4106\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m   4107\u001b[0m         key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 4108\u001b[0m     indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m   4110\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m   4111\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n",
      "File \u001b[0;32m/mnt/external_disk/venv/20250203_SimplePython/lib/python3.10/site-packages/pandas/core/indexes/base.py:6200\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m   6197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m   6198\u001b[0m     keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6200\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   6202\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m   6203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[1;32m   6204\u001b[0m     \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
      "File \u001b[0;32m/mnt/external_disk/venv/20250203_SimplePython/lib/python3.10/site-packages/pandas/core/indexes/base.py:6252\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m   6249\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m   6251\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[0;32m-> 6252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "\u001b[0;31mKeyError\u001b[0m: \"['Synonyms'] not in index\""
     ]
    }
   ],
   "source": [
    "# 检索包含\"Dyspnea\"的概念名称\n",
    "print(\"\\n包含'Dyspnea'的概念:\")\n",
    "dyspnea_concepts = df[df['concept_name'].str.contains('Dyspnea', case=False, na=False)]\n",
    "print(f\"找到 {len(dyspnea_concepts)} 条包含'Dyspnea'的记录\")\n",
    "print(dyspnea_concepts[['concept_code', 'concept_name', 'Synonyms']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 321341 - 拿这个索引号， 第321341行\n",
    "print(\"\\n检索第321341行的数据:\")\n",
    "print(df.iloc[321341])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 尝试使用模糊匹配\n",
    "print(\"\\n尝试模糊匹配:\")\n",
    "fuzzy_matches = df[df['concept_code'].str.contains('267036', na=False)]\n",
    "if len(fuzzy_matches) > 0:\n",
    "    print(\"找到的模糊匹配:\")\n",
    "    print(fuzzy_matches[['concept_code', 'concept_name']])\n",
    "\n",
    "# 检查Dyspnea相关的行\n",
    "print(\"\\n检查Dyspnea相关的行:\")\n",
    "dyspnea_rows = df[df['concept_name'].str.contains('Dyspnea', case=False, na=False)]\n",
    "print(dyspnea_rows[['concept_code', 'concept_name']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (20250203_SimplePython)",
   "language": "python",
   "name": "20250203_simplepython"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
